from pyspark import StorageLevel
from pyspark.sql.session import SparkSession

# NOTE(review): wildcard import is unused in this script and pollutes the
# namespace (e.g. pyspark's `sum`/`max` shadow the builtins); prefer
# `from pyspark.sql import functions as F`. Kept in case unseen code relies on it.
from pyspark.sql.functions import *
# NOTE(review): looks like an accidental IDE auto-import — `alias` is never
# used anywhere in this script and setuptools has no business in a Spark job.
# Should be removed; kept only because the rest of the file may not be visible.
from setuptools.command.alias import alias

# 1. Create the Spark SQL execution environment.
#    local mode, and only 1 shuffle partition since this is a small demo dataset.
spark = SparkSession \
    .builder \
    .master("local") \
    .config("spark.sql.shuffle.partitions", 1) \
    .getOrCreate()

# 2. Read the data into a DataFrame (a DataFrame is backed by an RDD).
# schema: explicit column names/types for the table
# sep: field delimiter of the input file
students_df = spark.read \
    .schema("id STRING,name STRING,age INT,sex STRING,clazz STRING") \
    .option("sep", ",") \
    .csv("../../data/students.txt")


# Cache the DataFrame because it is reused by the two aggregations below.
# BUG FIX: the original called both cache() and persist(StorageLevel.MEMORY_ONLY).
# cache() already pins the data at Spark's default storage level, so the later
# persist() call was a no-op that merely logged "Asked to cache already cached
# data" — the explicit MEMORY_ONLY level never took effect. A single persist()
# with the intended level is the correct form.
students_df.persist(StorageLevel.MEMORY_ONLY)

students_df.groupBy("clazz").count().show()
students_df.groupBy("sex").count().show()

# Release the cache, then shut the session down now that all actions have run.
students_df.unpersist()
spark.stop()